{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "4c074d28-ac05-4fb4-92ce-4f62e968d36d",
   "metadata": {},
   "source": [
    "# Connect to Elasticsearch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "4f6ddf8b-dc72-4ea7-a942-5d2352e038e7",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'name': 'liuxgm.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'SXGzrN4dSXW1t0pkWXGfjg', 'version': {'number': '8.11.0', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': 'd9ec3fa628c7b0ba3d25692e277ba26814820b20', 'build_date': '2023-11-04T10:04:57.184859352Z', 'build_snapshot': False, 'lucene_version': '9.8.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}\n"
     ]
    }
   ],
   "source": [
    "from elasticsearch import Elasticsearch\n",
    "import os\n",
    " \n",
    "elastic_user=os.getenv('ES_USER')\n",
    "elastic_password=os.getenv('ES_PASSWORD')\n",
    "elastic_endpoint=os.getenv(\"ES_ENDPOINT\")\n",
    " \n",
    "url = f\"https://{elastic_user}:{elastic_password}@{elastic_endpoint}:9200\"\n",
    "es = Elasticsearch(url, ca_certs = \"./http_ca.crt\", verify_certs = True)\n",
    " \n",
    "print(es.info())"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e1c3089c-e74d-42c5-b7df-707ccf4874a7",
   "metadata": {},
   "source": [
    "# Download and Deploy ELSER Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "cc56b525-0a68-4601-8fdc-6fad6bedadaa",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Model deleted successfully, We will proceed with creating one\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'model_id': '.elser_model_2', 'model_type': 'pytorch', 'model_package': {'packaged_model_id': 'elser_model_2', 'model_repository': 'https://ml-models.elastic.co', 'minimum_version': '11.0.0', 'size': 438123914, 'sha256': '2e0450a1c598221a919917cbb05d8672aed6c613c028008fedcd696462c81af0', 'metadata': {}, 'tags': [], 'vocabulary_file': 'elser_model_2.vocab.json'}, 'created_by': 'api_user', 'version': '11.0.0', 'create_time': 1703833983809, 'model_size_bytes': 0, 'estimated_operations': 0, 'license_level': 'platinum', 'description': 'Elastic Learned Sparse EncodeR v2', 'tags': ['elastic'], 'metadata': {}, 'input': {'field_names': ['text_field']}, 'inference_config': {'text_expansion': {'vocabulary': {'index': '.ml-inference-native-000002'}, 'tokenization': {'bert': {'do_lower_case': True, 'with_special_tokens': True, 'max_sequence_length': 512, 'truncate': 'first', 'span': -1}}}}, 'location': {'index': {'name': '.ml-inference-native-000002'}}})"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# delete model if already downloaded and deployed\n",
    "try:\n",
    "  es.ml.delete_trained_model(model_id=\".elser_model_2\",force=True)\n",
    "  print(\"Model deleted successfully, We will proceed with creating one\")\n",
    "except exceptions.NotFoundError:\n",
    "  print(\"Model doesn't exist, but We will proceed with creating one\")\n",
    "\n",
    "# Creates the ELSER model configuration. Automatically downloads the model if it doesn't exist. \n",
    "es.ml.put_trained_model(\n",
    "    model_id=\".elser_model_2\",\n",
    "    input={\n",
    "      \"field_names\": [\"text_field\"]\n",
    "    }\n",
    "  )\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "ee9404a1-8bb9-4521-a998-260cfe159f5c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded but not ready to be deployed.\n",
      "ELSER Model is downloaded and ready to be deployed.\n"
     ]
    }
   ],
   "source": [
    "while True:\n",
    "    status = es.ml.get_trained_models(\n",
    "        model_id=\".elser_model_2\",\n",
    "        include=\"definition_status\"\n",
    "    )\n",
    "    \n",
    "    if (status[\"trained_model_configs\"][0][\"fully_defined\"]):\n",
    "        print(\"ELSER Model is downloaded and ready to be deployed.\")\n",
    "        break\n",
    "    else:\n",
    "        print(\"ELSER Model is downloaded but not ready to be deployed.\")\n",
    "    time.sleep(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "d6468e49-be96-4a1e-bf18-f6196f7061cd",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'assignment': {'task_parameters': {'model_id': '.elser_model_2', 'deployment_id': '.elser_model_2', 'model_bytes': 438123914, 'threads_per_allocation': 1, 'number_of_allocations': 1, 'queue_capacity': 1024, 'cache_size': '438123914b', 'priority': 'normal', 'per_deployment_memory_bytes': 0, 'per_allocation_memory_bytes': 0}, 'routing_table': {'Wfv6REEYTky3JEV-lnLNHQ': {'current_allocations': 1, 'target_allocations': 1, 'routing_state': 'starting', 'reason': ''}}, 'assignment_state': 'starting', 'start_time': '2023-12-29T07:16:33.92542Z', 'max_assigned_allocations': 1}})"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import time\n",
    "\n",
    "# Start trained model deployment if not already deployed\n",
    "es.ml.start_trained_model_deployment(\n",
    "  model_id=\".elser_model_2\",\n",
    "  number_of_allocations=1,\n",
    "  wait_for=\"starting\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "9c378714-d592-4a3f-a1af-2f895820347d",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ELSER Model has been successfully deployed.\n"
     ]
    }
   ],
   "source": [
    "while True:\n",
    "  status = es.ml.get_trained_models_stats(\n",
    "    model_id=\".elser_model_2\",\n",
    "  )\n",
    "  if (status[\"trained_model_stats\"][0][\"deployment_stats\"][\"state\"] == \"started\"):\n",
    "    print(\"ELSER Model has been successfully deployed.\")\n",
    "    break\n",
    "  else:\n",
    "    print(\"ELSER Model is currently being deployed.\")\n",
    "  time.sleep(5)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1fa1a22d-f5cc-4365-9c08-8e882010e075",
   "metadata": {},
   "source": [
    "# Indexing Documents with ELSER"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "a26488ac-bf04-48e9-8dcc-f4408393b1b1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True})"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "es.ingest.put_pipeline(\n",
    "    id=\"elser-ingest-pipeline\", \n",
    "    description=\"Ingest pipeline for ELSER\",\n",
    "    processors=[\n",
    "    {\n",
    "      \"inference\": {\n",
    "        \"model_id\": \".elser_model_2\",\n",
    "        \"input_output\": [\n",
    "            {\n",
    "              \"input_field\": \"plot\",\n",
    "              \"output_field\": \"plot_embedding\"\n",
    "            }\n",
    "          ]\n",
    "      }\n",
    "    }\n",
    "  ]\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6b486b0d-e84d-4ed1-a80a-618c0cae92b9",
   "metadata": {},
   "source": [
    "# Create index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "d6888f13-5abe-4307-8142-4fbab6330be3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'elser-example-movies'})"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "es.indices.delete(index=\"elser-example-movies\", ignore_unavailable=True)\n",
    "es.indices.create(\n",
    "  index=\"elser-example-movies\",\n",
    "  settings={\n",
    "      \"index\": {\n",
    "          \"default_pipeline\": \"elser-ingest-pipeline\"\n",
    "      }\n",
    "  },\n",
    "  mappings={\n",
    "    \"properties\": {\n",
    "      \"plot\": {\n",
    "        \"type\": \"text\",\n",
    "        \"fields\": {\n",
    "          \"keyword\": {\n",
    "            \"type\": \"keyword\",\n",
    "            \"ignore_above\": 256\n",
    "          }\n",
    "        }\n",
    "      },\n",
    "      \"plot_embedding\": { \n",
    "        \"type\": \"sparse_vector\" \n",
    "      }\n",
    "    }\n",
    "  }\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "54d39faa-79d6-4c49-846d-a47a6e2d06d4",
   "metadata": {},
   "source": [
    "# Insert documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "ab859353-04ed-4919-bed6-40e976d26345",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Done indexing documents into `elser-example-movies` index!\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from elasticsearch import helpers\n",
    " \n",
    "with open('movies.json') as f:\n",
    "   data_json = json.load(f)\n",
    " \n",
    "# Prepare the documents to be indexed\n",
    "documents = []\n",
    "for doc in data_json:\n",
    "    documents.append({\n",
    "        \"_index\": \"elser-example-movies\",\n",
    "        \"_source\": doc,\n",
    "    })\n",
    " \n",
    "# Use helpers.bulk to index\n",
    "helpers.bulk(es, documents)\n",
    " \n",
    "print(\"Done indexing documents into `elser-example-movies` index!\")\n",
    "time.sleep(3)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6dc01836-68b7-43d2-8e95-a5c3a1c296e7",
   "metadata": {},
   "source": [
    "# Searching documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "707515d8-bd98-4ccf-b750-c3157a6ebe07",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Score: 12.763341\n",
      "Title: Fight Club\n",
      "Plot: An insomniac office worker and a devil-may-care soapmaker form an underground fight club that evolves into something much, much more.\n",
      "\n",
      "Score: 9.930414\n",
      "Title: Pulp Fiction\n",
      "Plot: The lives of two mob hitmen, a boxer, a gangster and his wife, and a pair of diner bandits intertwine in four tales of violence and redemption.\n",
      "\n",
      "Score: 9.488333\n",
      "Title: The Matrix\n",
      "Plot: A computer hacker learns from mysterious rebels about the true nature of his reality and his role in the war against its controllers.\n",
      "\n"
     ]
    }
   ],
   "source": [
    "response = es.search(\n",
    "    index='elser-example-movies', \n",
    "    size=3,\n",
    "    query={\n",
    "        \"text_expansion\": {\n",
    "            \"plot_embedding\": {\n",
    "                \"model_id\":\".elser_model_2\",\n",
    "                \"model_text\":\"fighting movie\"\n",
    "            }\n",
    "        }\n",
    "    }\n",
    ")\n",
    "\n",
    "for hit in response['hits']['hits']:\n",
    "    doc_id = hit['_id']\n",
    "    score = hit['_score']\n",
    "    title = hit['_source']['title']\n",
    "    plot = hit['_source']['plot']\n",
    "    print(f\"Score: {score}\\nTitle: {title}\\nPlot: {plot}\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "66ca2ec0-d77c-4d1b-85df-da885330199c",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python_3.11",
   "language": "python",
   "name": "python_3.11"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
